***** Is Europe an optimal Political Area?                *****
***** Alberto Alesina, Guido Tabellini & Francesco Trebbi *****

***** Section V.a

* Session set-up
clear all
set mem 10g
set matsize 11000
set more off
version 13
*

use "data/EVS_GSS_final.dta", clear

* Output folders (capture: they may already exist)
capture mkdir "section5"
capture mkdir "appendix"

** Restrict the sample to the nine US states used in this section
keep if country=="US"
decode us_state, gen(state)
keep if inlist(state, "California", "New York", "Texas", "North Carolina", "Ohio") | ///
        inlist(state, "Michigan", "Illinois", "Florida", "Pennsylvania")
* Blanks are replaced by underscores so state names can be used as single tokens
replace state = subinstr(state, " ", "_", .) if inlist(state, "New York", "North Carolina")

** Indicator (dummy) sets for the categorical covariates
foreach catvar in marital_status education employment income_recoded size_of_town {
	qui tab `catvar', gen(X_`catvar')
}
qui tab education1, gen(X_education_v)

** Map survey years into waves; years outside the four waves are dropped
gen wave = .
replace wave = 1 if inlist(year, 1981, 1982, 1983, 1984, 1986)
replace wave = 2 if inlist(year, 1990, 1991, 1993)
replace wave = 3 if inlist(year, 1998, 1999, 2000)
replace wave = 4 if inlist(year, 2006, 2008, 2009, 2010)
drop if missing(wave)

** Cultural variables
* Reduced list
global cult Y_obedience Y_trust Y_ideology Y_religious Y_divorce Y_euthan Y_suicide Y_altruism Y_hardwork
* Extended list 1 = reduced list + redistribution and gender-role items
global ecult1 $cult Y_redistrib Y_workingfemale Y_careerfemale Y_preschoolmother
* Extended list 2 = extended list 1 + private ownership and equalisation items
global ecult2 $ecult1 Y_private Y_equalize

** Covariates
* Reduced list
global cov X_age X_sex X_marital_status*                                        ///
	X_employment1 X_employment2 X_employment3 X_employment4 X_employment5 X_employment6 ///
	X_income_recoded1 X_income_recoded2 X_income_recoded3                       ///
	X_education_v*
* Extended list 1
global ecov1 X_age X_sex X_marital_status*                                      ///
	X_employment1 X_employment2 X_employment3 X_employment4 X_employment5 X_employment6 ///
	X_income_recoded1 X_income_recoded2 X_income_recoded3                       ///
	X_size_of_town* X_education_v*
* Extended list 2
global ecov2 X_age X_sex X_marital_status*                                      ///
	X_employment1 X_employment2 X_employment3 X_employment4 X_employment5 X_employment6 ///
	X_income_recoded1 X_income_recoded2 X_income_recoded3                       ///
	X_size_of_town*                                                             ///
	X_education1 X_education2 X_education3 X_education4 X_education5 X_education6

* This section runs on the extended sets: the working lists are overwritten here
global cult $ecult2
global cov $ecov2
global labellist "Extended" 

* From here on "country" holds the state name, so the code below treats states as countries
drop country
gen country=state

** Draw (at most) N individuals at random from every country-wave cell
global N = 60
set seed 364011739

* Listwise deletion: keep only respondents observed on every variable used below
foreach v of varlist $cult $cov Y_trust Y_ideology {
	drop if missing(`v')
}

* Random order within cell, then keep the first N
gen random = runiform() 
bysort country wave (random): drop if _n > $N

* Standardize every variable (mean 0, sd 1 over the pooled sample) so that all
* dimensions get equal weight in the distance measures below
tempname mu sigma
foreach v of varlist $cov $cult Y_trust Y_ideology {
	qui sum `v'
	scalar `mu'    = r(mean)
	scalar `sigma' = r(sd)
	replace `v' = (`v' - `mu')/`sigma'
}

********************************************************************************
program gaussian_kernel
	* Pairwise distance (1 - Gaussian kernel) between all observations in memory.
	*
	*   varlist      variables spanning the space in which distance is measured
	*   idvar()      observation identifier
	*   genfile()    file the pair-level result is saved to (replaced if present)
	*   name1()      name given to the identifier of the first member of a pair
	*   name2()      name given to the identifier of the second member of a pair
	*   scorevar()   name given to the distance variable
	*
	* Side effects: writes rtmp.dta and tmp0.dta in the working directory.
	* The data in memory on entry are reloaded from rtmp.dta before returning.
	syntax varlist , [ idvar(varname) genfile(string) name1(name) name2(name) scorevar(name) ]

	* Park the caller's data on disk and work on the needed columns only
	save rtmp.dta, replace
	keep `idvar' `varlist' 

	/* Kernel bandwidth is set to the number of dimensions, following the
	Hainmueller and Hazlett KRLS package:
	"By default, the bandwidth is set equal to D (the number of dimensions) which 
	typically yields a reasonable scaling of the distances between observations 
	in the standardized data that is used for the fitting."
	r(k) counts the variables currently in memory (idvar + varlist), hence the -1.
	*/
	tempname bandwidth scale
	qui describe
	scalar `bandwidth' = r(k) - 1
	scalar `scale' = 1/(2*`bandwidth')

	* Cross join of the data with itself through a constant key;
	* the second copy carries the suffix "2" on every variable
	gen const = 1
	preserve
		sort const
		foreach v of varlist `idvar' `varlist'  {
			rename `v' `v'2
		}
		save tmp0.dta, replace
	restore
	sort const
	joinby const using tmp0.dta
	drop const

	* Squared difference in every dimension
	foreach v of varlist `varlist' {
		gen double `v'__d = (`v' - `v'2)^2
	}

	* Euclidean distance of the two vectors
	egen double ed = rsum(*__d)
	replace ed = sqrt(ed)

	* 1 - Gaussian kernel: 0 = same vectors, 1 = maximally distant vectors
	gen double `scorevar' = 1 - exp(-(`scale'*ed^2))

	rename `idvar' `name1'
	rename `idvar'2 `name2'
	keep `name1' `name2' `scorevar' 
	save `genfile', replace

	* Hand the caller's data back
	use rtmp.dta, clear
end
********************************************************************************
program cosine
	* Pairwise angular (cosine) distance between all observations in memory.
	*
	*   varlist      variables spanning the space in which the angle is measured
	*   idvar()      observation identifier
	*   genfile()    file the pair-level result is saved to (replaced if present)
	*   name1()      name given to the identifier of the first member of a pair
	*   name2()      name given to the identifier of the second member of a pair
	*   scorevar()   name given to the distance variable
	*
	* Side effects: writes rtmp.dta and tmp0.dta in the working directory.
	* The data in memory on entry are reloaded from rtmp.dta before returning.
	#delimit ;
	syntax varlist ,
	[
	idvar(varname)
	
	genfile(string) name1(name) name2(name) scorevar(name)
	]
	;
	#delimit cr
	*
	save rtmp.dta, replace
	keep `idvar' `varlist' 

	* Cross join of the data with itself through a constant key;
	* the second copy carries the suffix "2" on every variable
	gen const = 1
	preserve
		sort const
		foreach var of varlist `idvar' `varlist'  {
			rename `var' `var'2
		}
		save tmp0.dta, replace
	restore
	*
	sort const
	joinby const using tmp0.dta
	drop const

	* Per-dimension pieces: cross product and the two squared components
	foreach var of varlist `varlist' {
		gen double `var'__d = `var'*`var'2
		gen double `var'__2 = `var'^2
		gen double `var'__22= `var'2^2
	}
	
	*Cosine = dot product / (norm of first vector * norm of second vector)
	egen double ed = rsum(*__d)
	egen double ed2 = rsum(*__2)
	replace ed2 = sqrt(ed2)	
	egen double ed22 = rsum(*__22)
	replace ed22 = sqrt(ed22)
	gen double gk = ed/(ed2*ed22)

	* Round-off can leave the ratio marginally outside [-1,1] for (anti)parallel
	* vectors - in particular for every observation paired with itself - and
	* acos() returns missing there. Clamp before taking the angle. Missing ratios
	* (zero-length vector) are left missing: max()/min() ignore missing arguments
	* and would otherwise turn them into a distance of 0.
	replace gk = max(-1, min(1, gk)) if !missing(gk)

	*get cosine measure of distance 0= same vectors, 1= maximally distant vectors.
	replace gk = acos(gk)/_pi
	rename gk `scorevar'
	rename `idvar' `name1'
	rename `idvar'2 `name2'
	keep `name1' `name2' `scorevar' 
	save `genfile', replace

	* Hand the caller's data back
	use rtmp.dta, clear
end
********************************************************************************

** Lookup files used below to attach state and location information to each
** member of a pair: tmp1r.dta is keyed on ROW, tmp1c.dta on COL
foreach side in row col {
	local key    = upper("`side'")
	local letter = substr("`side'", 1, 1)
	preserve
	keep persno country wave latitude longitude CapitalLatitude CapitalLongitude
	rename persno `key'
	foreach v in country latitude longitude CapitalLatitude CapitalLongitude {
		rename `v' `v'_`side'
	}
	sort `key'
	save "tmp1`letter'.dta", replace
	restore
}

** Residual of every cultural variable after partialling out the covariates
** (regressions are run on the pooled sample; global i counts the variables)
global i = 1
foreach yvar of varlist $cult {
	reg `yvar' $cov 
	predict res_`yvar', resid
	global i = $i + 1
}


** Generate distance between randomly sampled individuals (paired within each wave and all countries)
forvalues i=4(1)4 {
	preserve
		keep if wave==`i'
		global wave = `i'

		* Leave out variables with no variance in this wave: they add nothing to
		* any distance but would still count towards the kernel bandwidth.
		* Each list is expanded with unab first, so wildcard entries such as
		* X_marital_status* are checked one variable at a time. (Looping over the
		* raw tokens would test only the last variable matched by a wildcard and
		* then drop the whole group.) Filtered per-wave lists are built instead of
		* dropping variables, so the calls below never refer to a variable that
		* no longer exists.
		foreach lst in cov ecov2 cult ecult2 {
			unab candidates : ${`lst'}
			local use_`lst'
			foreach v of local candidates {
				qui sum `v'
				if r(sd) != 0 {
					local use_`lst' `use_`lst'' `v'
				}
			}
		}
		
		*x basic socio-economic covariates set
		qui gaussian_kernel `use_cov', idvar(persno) genfile(tmp_x`i') scorevar(cov_mdist) name1(ROW) name2(COL)
		
		*X extended socio-economic covariates set, only for waves 2 to 4
		if $wave ==2 | $wave ==3 | $wave ==4 {
			qui gaussian_kernel `use_ecov2', idvar(persno) genfile(tmp_eX`i') scorevar(ecov_mdist) name1(ROW) name2(COL)
		}
		
		*y basic culture
		qui gaussian_kernel `use_cult', idvar(persno) genfile(tmp_y`i') scorevar(cult_mdist) name1(ROW) name2(COL)
		
		*Y extended culture
		qui gaussian_kernel `use_ecult2', idvar(persno) genfile(tmp_eY`i') scorevar(ecult_mdist) name1(ROW) name2(COL)
		
		*residuals of y basic culture on covariates 
		qui gaussian_kernel res_*, idvar(persno) genfile(tmp_resy`i') scorevar(res_cult_mdist) name1(ROW) name2(COL)
		
		*trust, ideology, and residuals on covariates
		qui gaussian_kernel Y_trust, idvar(persno) genfile(tmp_xt`i') scorevar(Y_trust_mdist) name1(ROW) name2(COL)
		qui gaussian_kernel Y_ideology, idvar(persno) genfile(tmp_xi`i') scorevar(Y_ideology_mdist) name1(ROW) name2(COL)
		qui gaussian_kernel res_Y_trust, idvar(persno) genfile(tmp_resyt`i') scorevar(res_Y_trust_mdist) name1(ROW) name2(COL)
		qui gaussian_kernel res_Y_ideology, idvar(persno) genfile(tmp_resyi`i') scorevar(res_Y_ideology_mdist) name1(ROW) name2(COL)
		
		restore
}

** Turn the pair-level distance files into analysis data: add the wave
** identifier and store every file sorted on the pair key (ROW COL)
forvalues w = 4/4 {
	global wave = `w'
	foreach stem in x eX y eY resy xt xi resyt resyi {
		* the extended-covariate file exists only for waves 2 to 4
		local available = 1
		if "`stem'" == "eX" & !($wave == 2 | $wave == 3 | $wave == 4) {
			local available = 0
		}
		if `available' {
			use "tmp_`stem'`w'.dta", clear
			gen wave = `w'
			sort ROW COL
			save "tmp_`stem'`w'.dta", replace
		}
	}
}
* Only wave 4 is used: copy each wave-4 pair file to its wave-less name,
* sorted on the pair key so it is ready for merging
foreach stem in x xt xi eX y eY resy resyt resyi {
	use "tmp_`stem'4.dta", clear
	sort ROW COL
	save "tmp_`stem'.dta", replace
}
*
* Put all distance measures side by side, one observation per (ROW, COL) pair.
* tmp_x.dta was saved sorted on ROW COL, so the first merge needs no sort;
* the data are re-sorted before each of the following merges.
use "tmp_x.dta", clear
merge ROW COL using "tmp_eX.dta" 
tab _merge
drop _merge
foreach stem in y eY resy xt xi resyt resyi {
	sort ROW COL
	merge ROW COL using "tmp_`stem'.dta"
	tab _merge
	drop _merge
}
*
* Variable labels
label var cov_mdist            "Dist. in Covariates"
label var ecov_mdist           "Dist. in Ext. Covariates"
label var cult_mdist           "Dist. in Culture"
label var ecult_mdist          "Dist. in Ext. Culture"
label var res_cult_mdist       "Dist. in Cultural residuals"
label var Y_trust_mdist        "Dist. in Trust"
label var Y_ideology_mdist     "Dist. in Ideology"
label var res_Y_trust_mdist    "Dist. in Trust residuals"
label var res_Y_ideology_mdist "Dist. in Ideology residuals"

** Identifying countries
* Attach state and location of the ROW member, then of the COL member.
* Lookup rows that match no pair (_merge==2) are discarded.
sort ROW
merge ROW using "tmp1r.dta"
keep if _m~=2
drop _m
sort COL
merge COL using "tmp1c.dta"
keep if _m~=2
drop _m
* same = 1 when both members of the pair live in the same state
gen same = (country_row==country_col)

*Keeping unique correlations
* drop diagonal 
drop if ROW==COL
* keep one of the two mirror-image copies (r,c) / (c,r) of every pair:
* id1 is the position of the pair in ROW-major order, id2 its position in
* COL-major order; the pair is kept when id1 is the smaller of the two.
* rowsort is community-contributed: install it only when it is not available
capture which rowsort
if _rc {
	ssc install rowsort
}
isid ROW COL
sort ROW COL
gen id1 = _n
sort COL ROW
gen id2 = _n 
rowsort id1 id2, gen(id3 id4)
keep if id1==id3&id2==id4
drop id1 id2 id3 id4
replace country_row = "GB" if country_row == "GB-GBN"
replace country_col = "GB" if country_col == "GB-GBN"

* Geographic distances. geodist is community-contributed as well; installing
* it only when missing avoids requiring network access on every run.
capture which geodist
if _rc {
	ssc install geodist
}
geodist latitude_row longitude_row latitude_col  longitude_col, gen(region_dist)
geodist CapitalLatitude_row CapitalLongitude_row CapitalLatitude_col  CapitalLongitude_col, gen(country_dist)
label var region_dist "Dist. between regions"
label var country_dist "Dist. between countries"

save "data_to_use.dta", replace

* Figure 13
* Kernel densities of pairwise distance, same-state pairs (dashed) against
* different-state pairs (solid): panel a = culture, panel b = residual culture
use "data_to_use.dta", clear
keep if wave==4

local dist_a  cult_mdist
local title_a "Cultural distance"
local dist_b  res_cult_mdist
local title_b "Distance in residuals of culture"

foreach panel in a b {
	kdensity `dist_`panel'' if same==1,                                              ///
		addplot(kdensity `dist_`panel'' if same==0, lw(medthick) lp(solid))          ///
		legend(lab(1 "Same state") label(2 "Different states"))                      ///
		lw(medthick) lp(dash)                                                        ///
		title("`title_`panel''")                                                     ///
		note("$labellist set of cultural variables. Wave 4.") 
	graph export "section5/figure_13`panel'.png", replace
}
***


********************************************************************************
** Back to Main Graphs
********************************************************************************

* Figure A.15
* Cultural distance regressed on socio-economic distance, separately for
* different-state (0) and same-state (1) pairs, with 95% bands around the fit
use "data_to_use.dta", clear
keep if wave==4
forvalues g = 0/1 {
	ivreg2 cult_mdist cov_mdist if same==`g', cluster(country_row country_col)

	* Fitted equation, shown as text inside the graph
	local slope = round(_b[cov_mdist],.0001)
	local icpt  = round(_b[_cons],.001)
	local icpt  = string(`icpt')
	global regression`g' "y = 0`icpt' + 0`slope' * x"

	* Fitted values and pointwise 95% confidence band
	predict hat`g' 		if e(sample)
	predict s`g' , stdp     
	generate low`g' = hat`g' - 1.96*s`g'
	generate hi`g'  = hat`g' + 1.96*s`g'
}
twoway (rarea low0 hi0 cov_mdist , sort color(gs14) )                                        ///
       (rarea low1 hi1 cov_mdist , sort color(gs14) )                                        ///
       (line hat0 cov_mdist, sort lc(black) ylab(0.5(0.02)0.62)                              ///
            text(0.57 0.5 "Diff. state:    $regression0", place(e)))                         ///
       (line hat1 cov_mdist, sort lc(black) lp(dash)                                         ///
            text(0.58 0.5 "Same state: $regression1", place(e)))                             ///
       , xtitle("Economic Distance") ytitle("Cultural Distance")                             ///
         title("Differences in economic vs cultural dimensions")                             ///
         subtitle("Within and cross state")                                                  ///
         note("Full set of socioeconomic variables. Wave 4.")                                ///
         legend(order(4 3) lab(4 "Same state") label(3 "Diff. state"))
qui graph export "appendix/figure_A15.png", width(3000) height(2000) replace
***
***

* Figure A.16
* Cultural distance regressed on geographic distance, separately for
* different-state (0) and same-state (1) pairs, with 95% bands around the fit
use "data_to_use.dta", clear
keep if wave==4
* rescale so that one unit is 10,000 km
replace region_dist = region_dist / 10000

foreach x in 0 1 {
ivreg2 cult_mdist region_dist if same==`x', cluster(country_row country_col)

* Equation text shown inside the graphs. Both labels are built from the slope of
* THIS regression: the slope is rounded before either label is assembled, so a
* label can never pick up an empty value or the slope of the previous iteration.
local b0    = string(round(_b[_cons],.001))
* "pos" label: slope written after a plus sign
local b1pos = string(round(_b[region_dist],.001))
global regression`x'pos "y = 0`b0' + 0`b1pos' * x"
* default label: sign-flipped slope written after a minus sign
local b1    = string(-round(_b[region_dist],.001))
global regression`x' "y = 0`b0' - 0`b1' * x"

* Fitted values and pointwise 95% confidence band
predict hat`x' 		if e(sample)
predict s`x' , stdp     
generate low`x' = hat`x' - 1.96*s`x'
generate hi`x'  = hat`x' + 1.96*s`x'
}
* Left panel: same-state pairs only
preserve
keep if same==1
twoway (rarea low1 hi1 region_dist , sort color(gs14) ) (line hat1 region_dist, sort lc(black) legend(off) lp(solid) /*xlab(0(0.02)0.1)*/ text(0.59 0.03 "$regression1pos")), ///
	    xtitle("Geographic Distance between counties (10k Kms)") ytitle("Cultural Distance") title("Within State")
qui graph save fig6_4a, replace
restore
* Right panel: different-state pairs
twoway (rarea low0 hi0 region_dist , sort color(gs14) ) (line hat0 region_dist, sort lc(black) legend(off)  /*ylab(0.5(0.02)0.62)*/ text(0.59 0.2 "$regression0")),  ///
	    xtitle("Geographic Distance between counties (10k Kms)") ytitle("Cultural Distance") title("Cross State")
qui graph save fig6_4b, replace
graph combine fig6_4a.gph fig6_4b.gph, c(2) ycommon ysize(4.4) xsize(11) title("Differences in geographic vs cultural dimensions") note("Full set of socioeconomic variables. Wave 4.")
qui graph export "appendix/figure_A16.png", width(3000) as(png) replace
***
***


* Table A.9
* Intercepts of cult_mdist on cov_mdist for every pair of states.
*   A[i,j], j = 1..9 : pairs with one member in state i and the other in state j
*   A[i,10]          : pairs with one member in state i and the other in any
*                      different state
use "data_to_use.dta", clear
keep if wave==4
global country4 "California Texas Illinois Michigan Florida Ohio North_Carolina Pennsylvania New_York"
 
mat A = J(9,10,0)
local i = 1
foreach countryi of global country4 {
	local j = 1
	foreach countryj of global country4 {
		qui reg cult_mdist cov_mdist if ((country_row=="`countryi'" & country_col=="`countryj'")|(country_col=="`countryi'" & country_row=="`countryj'"))
		mat A[`i',`j'] = _b[_cons]
		local ++j
	}
	* state i against all other states
	qui reg cult_mdist cov_mdist if (country_row=="`countryi'" | country_col=="`countryi'") & same==0
	mat A[`i',10] = _b[_cons]
	local ++i
}
mat rown A = $country4
* A has ten columns: the tenth (state vs. all other states) gets its own label
* instead of being left without one by a nine-name list
mat coln A = $country4 Other_states
mata: A = st_matrix("A")

* Output graphs & tables
mata:
        dh = _docx_new()
        _docx_set_font(dh, "Garamond")
        _docx_set_size(dh, 15)
end

mata: e = _docx_paragraph_new(dh,"")
mata: e = _docx_paragraph_add_text(dh, "Table: Avg. cultural distance between row & column individuals of identical socioeconomic level expressed in socioeconomic distance units of the row country.")
mata: e = _docx_set_size(dh, 30)
mata: e = _docx_text_set_bold(dh, 1)
mata: e = _docx_paragraph_new(dh,"")
mata: e = _docx_set_size(dh, 15)
mata: e = _docx_add_matrix(dh, "A", "%10.2f", 1, 1)
mata: e = _docx_add_pagebreak(dh)

mata:
	_docx_save(dh, "appendix/table_A9.docx",1)
	_docx_close(dh)
end

* Keep a copy of the wave-4 pair data currently in memory
saveold "data/data_to_use_US_states", replace


** Housekeeping
* Remove every intermediate file written above. capture keeps the loop going
* for files that were never created (e.g. waves other than 4). The trust /
* ideology pair files (tmp_xt*, tmp_xi*, tmp_resyt*, tmp_resyi*) are listed as
* well, so they no longer stay behind in the working directory.
foreach file in tmp1c.dta tmp1r.dta tmp_x1.dta tmp_x2.dta tmp_x3.dta tmp_x4.dta ///
	tmp_y1.dta tmp_y2.dta tmp_y3.dta tmp_y4.dta tmp_eY1.dta tmp_eY2.dta         ///
	tmp_eY3.dta tmp_eY4.dta tmp_eX3.dta tmp_eX4.dta rtmp.dta tmp_eX2.dta        ///
	tmp_resy1.dta tmp_resy2.dta tmp_resy3.dta tmp_resy4.dta tmp0.dta 			///
	tmp_xt4.dta tmp_xi4.dta tmp_resyt4.dta tmp_resyi4.dta                       ///
	tmp_xt.dta tmp_xi.dta tmp_resyt.dta tmp_resyi.dta                           ///
	tmp_eY.dta  tmp_y.dta  tmp_eX.dta  tmp_x.dta  tmp_resy.dta fig6_4a.gph fig6_4b.gph  ///
	data_to_use.dta {
	capture erase `file'
}

